Review Midterm

Author

Hermann Junhao Fan

The questions will be the same as those on the sample exam, but the dataset is the eggs_price dataset in Lab 4's folder.

# Read the egg price data and assign descriptive column names.
egg_data <- read.csv("eggs_price_2025.csv")
colnames(egg_data) <- c("Date", "Price")

# Interactive time-series plot of the raw prices.
plot_ly(egg_data, x = ~as.Date(Date)) %>%
  add_lines(
    y = ~Price,
    name = 'Egg prices (USD)',
    line = list(color = 'blue')
  ) %>%
  layout(
    title = "Time series plot in Egg Prices",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Egg Prices")
  )

# Build a ts object starting at the first observed date.
ts_egg <- ts(
  egg_data$Price,
  start = decimal_date(as.Date(min(egg_data$Date))),
  frequency = 12  # monthly data
)

plot(ts_egg, main = "Time Series plot")

Checking for stationarity

# ACF of the raw series out to lag 50; slow decay suggests non-stationarity.
f1 <- ggAcf(ts_egg, lag.max = 50) +
  theme_minimal() +
  ggtitle("ACF plot without differencing")

f1

The ACF decays slowly toward 0, which is a typical sign of non-stationarity. The ADF test below indicates the same.

tseries::adf.test(ts_egg)

    Augmented Dickey-Fuller Test

data:  ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary

Lag plot:

# Scatter-style lag plot (points only, no lines) to visualise serial dependence.
gglagplot(ts_egg, do.lines = FALSE) +
  theme_minimal() +
  ggtitle("Lag Plot of Original Egg Prices")

From lags 1-4 we can see clear autocorrelation; at higher lags, the lag plot does not show a consistent autocorrelation pattern.

require(gridExtra)
Loading required package: gridExtra

Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':

    combine
# Stack the raw series above its log transform to judge whether a
# variance-stabilising transformation is needed.
plot1 <- autoplot(ts_egg, main = "without the log transformation")
plot2 <- autoplot(log(ts_egg), main = "log transformed data")

grid.arrange(plot1, plot2, nrow = 2)

There is no substantial difference between the two plots, so we do not need a log transformation in this case.

tseries::adf.test(ts_egg)

    Augmented Dickey-Fuller Test

data:  ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary

The series is not stationary, so we take the first difference.

# ACF of the original series (lag 50) shown again before differencing.
fig1 <- ggAcf(ts_egg, lag.max = 50) +
  theme_minimal() +
  ggtitle("ACF of egg's price time series")

fig1

Non-stationary

# First difference of the egg price series.
# NOTE(review): the name `diff` masks base::diff() for the rest of the
# script; a name like `egg_diff` would be safer, but later chunks
# reference `diff`, so the name is kept unchanged here.
diff <- diff(ts_egg)
# ACF of the differenced series: rapid decay indicates stationarity.
fig_diff1 <- ggAcf(diff, 50) + 
  ggtitle("ACF For differenced egg price") + 
  theme_minimal()

fig_diff1

tseries::adf.test(diff)
Warning in tseries::adf.test(diff): p-value smaller than printed p-value

    Augmented Dickey-Fuller Test

data:  diff
Dickey-Fuller = -8.3505, Lag order = 8, p-value = 0.01
alternative hypothesis: stationary

Now it is stationary, second differencing is not needed.

# d = 1 was established above. The differenced ACF showed two significant
# spikes, suggesting q in {0, 1, 2}; the PACF below guides the choice of p.
fig_diff_P <- ggPacf(diff, 50) +
  theme_minimal() +
  ggtitle("PACF for differenced data")

fig_diff_P  # significant PACF spikes suggest p in {0, 1, 2}

# Load necessary libraries
library(knitr)
library(kableExtra)

Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':

    group_rows
library(forecast)

# Grid-search ARIMA(p, d, q) candidates over d = 1:2, p = 0:2, q = 0:2
# and record the order plus AIC / BIC / AICc for each fit.
d_values <- 1:2
p_values <- 0:2
q_values <- 0:2

# Pre-allocate exactly one row per candidate (18 models here) so the
# results table carries no leftover NA rows. (The original allocated 20
# rows, producing two all-NA rows in the rendered table.)
n_models <- length(d_values) * length(p_values) * length(q_values)
results_matrix <- matrix(NA_real_, nrow = n_models, ncol = 6)

# Row index into results_matrix.
i <- 1

for (d in d_values) {
  for (p in p_values) {
    for (q in q_values) {
      # A drift term is only fitted when d < 2; Arima() warns and drops
      # it otherwise, so request it conditionally to avoid 9 warnings.
      model <- Arima(ts_egg, order = c(p, d, q),
                     include.drift = (d < 2))
      
      # Store the order and the information criteria for this fit.
      results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
      
      i <- i + 1
    }
  }
}
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
# Assemble the search results into a labelled data frame.
results_df <- as.data.frame(results_matrix)
colnames(results_df) <- c("p", "d", "q", "AIC", "BIC", "AICc")

# Locate the model with the lowest AIC so it can be highlighted.
highlight_row <- which.min(results_df$AIC)

# Render the comparison table; the best-AIC row is bold on yellow.
knitr::kable(results_df, caption = "Comparison of ARIMA Models", align = 'c') %>%
  kable_styling(position = "center", full_width = FALSE) %>%
  row_spec(highlight_row, background = "#FFFF99", bold = TRUE)
Comparison of ARIMA Models
p d q AIC BIC AICc
0 1 0 -676.7017 -668.1222 -676.6793
0 1 1 -678.7610 -665.8919 -678.7161
0 1 2 -681.5609 -664.4021 -681.4860
1 1 0 -679.5362 -666.6671 -679.4914
1 1 1 -681.2114 -664.0526 -681.1365
1 1 2 -682.4048 -660.9562 -682.2922
2 1 0 -682.5958 -665.4369 -682.5209
2 1 1 -681.4265 -659.9779 -681.3139
2 1 2 -689.1763 -663.4380 -689.0184
0 2 0 -364.6668 -360.3789 -364.6593
0 2 1 -668.1642 -659.5885 -668.1418
0 2 2 -670.3585 -657.4949 -670.3135
1 2 0 -521.5931 -513.0174 -521.5706
1 2 1 -671.1789 -658.3153 -671.1339
1 2 2 -673.0732 -655.9218 -672.9982
2 2 0 -573.8695 -561.0059 -573.8246
2 2 1 -674.4413 -657.2898 -674.3662
2 2 2 -673.3408 -651.9015 -673.2280
NA NA NA NA NA NA
NA NA NA NA NA NA

So far, model ARIMA(2, 1, 2) is the best.

auto.arima(ts_egg)
Series: ts_egg 
ARIMA(4,1,1)(0,0,2)[12] with drift 

Coefficients:
         ar1     ar2      ar3      ar4      ma1    sma1    sma2   drift
      0.9745  0.0595  -0.0021  -0.1609  -0.9473  0.1057  0.1498  0.0045
s.e.  0.0506  0.0609   0.0617   0.0459   0.0297  0.0489  0.0572  0.0028

sigma^2 = 0.0155:  log likelihood = 361.64
AIC=-705.27   AICc=-704.93   BIC=-666.66

auto.arima() says that ARIMA(4, 1, 1)(0, 0, 2)[12] with drift is the best.

# Second search pass: widen the AR order to p = 0:4 (motivated by the
# auto.arima result), keeping d = 1 -- the series was stationary after
# one difference -- and q = 0:2.

# Load necessary libraries
library(knitr)
library(kableExtra)
library(forecast)

p_values <- 0:4
q_values <- 0:2
d <- 1

# Pre-allocate exactly one row per candidate (15 models) so the results
# table has no trailing NA rows. (The original allocated 20 rows,
# producing five all-NA rows in the rendered table.)
n_models <- length(p_values) * length(q_values)
results_matrix <- matrix(NA_real_, nrow = n_models, ncol = 6)

# Row index into results_matrix.
i <- 1

for (p in p_values) {
  for (q in q_values) {
    model <- Arima(ts_egg, order = c(p, d, q), include.drift = TRUE)
    
    # Store the order and the information criteria for this fit.
    results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
    
    i <- i + 1
  }
}

# Label the second-pass results and locate the lowest-AIC model.
results_df <- as.data.frame(results_matrix)
colnames(results_df) <- c("p", "d", "q", "AIC", "BIC", "AICc")
highlight_row <- which.min(results_df$AIC)

# Display the comparison table with the best-AIC row bold on yellow.
knitr::kable(results_df, caption = "Comparison of ARIMA Models", align = 'c') %>%
  kable_styling(position = "center", full_width = FALSE) %>%
  row_spec(highlight_row, background = "#FFFF99", bold = TRUE)
Comparison of ARIMA Models
p d q AIC BIC AICc
0 1 0 -676.7017 -668.1222 -676.6793
0 1 1 -678.7610 -665.8919 -678.7161
0 1 2 -681.5609 -664.4021 -681.4860
1 1 0 -679.5362 -666.6671 -679.4914
1 1 1 -681.2114 -664.0526 -681.1365
1 1 2 -682.4048 -660.9562 -682.2922
2 1 0 -682.5958 -665.4369 -682.5209
2 1 1 -681.4265 -659.9779 -681.3139
2 1 2 -689.1763 -663.4380 -689.0184
3 1 0 -682.9246 -661.4760 -682.8120
3 1 1 -681.6765 -655.9382 -681.5186
3 1 2 -690.9219 -660.8939 -690.7110
4 1 0 -683.1827 -657.4444 -683.0248
4 1 1 -695.9539 -665.9259 -695.7430
4 1 2 -694.0638 -659.7460 -693.7921
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA NA NA NA NA

ARIMA(4, 1, 2) is so far the best.

Diagnostics:

Possible models:

ARIMA(4, 1, 1) — selected by auto.arima(); ARIMA(2, 1, 2) — our first choice from the AIC comparison table.

# Fit ARIMA(2,1,2) with sarima() and capture its printed diagnostics.
model_output <- capture.output(sarima(ts_egg, 2, 1, 2))

# Show only the coefficient table through the end of the captured output;
# the start is located by keyword rather than a hard-coded line number.
start_line <- grep("Coefficients", model_output)
cat(model_output[start_line:length(model_output)], sep = "\n")
Coefficients: 
         Estimate     SE  t.value p.value
ar1        0.7828 0.0584  13.4153  0.0000
ar2       -0.8377 0.0940  -8.9131  0.0000
ma1       -0.7742 0.0265 -29.2028  0.0000
ma2        0.9338 0.0827  11.2940  0.0000
constant   0.0061 0.0060   1.0224  0.3071

sigma^2 estimated as 0.0159264 on 534 degrees of freedom 
 
AIC = -1.27862  AICc = -1.278411  BIC = -1.230868 
 
# Fit ARIMA(4,1,1) with sarima() and capture its printed diagnostics.
model_output <- capture.output(sarima(ts_egg, 4, 1, 1))

# Locate the coefficient section by keyword and print from there to the end.
start_line <- grep("Coefficients", model_output)
end_line <- length(model_output)
cat(model_output[start_line:end_line], sep = "\n")
Coefficients: 
         Estimate     SE  t.value p.value
ar1        0.9655 0.0554  17.4269  0.0000
ar2        0.0288 0.0604   0.4770  0.6336
ar3       -0.0007 0.0615  -0.0108  0.9914
ar4       -0.1464 0.0460  -3.1832  0.0015
ma1       -0.9158 0.0363 -25.2417  0.0000
constant   0.0049 0.0030   1.6049  0.1091

sigma^2 estimated as 0.01567457 on 533 degrees of freedom 
 
AIC = -1.291195  AICc = -1.290902  BIC = -1.235484 
 

The best model should be ARIMA(2, 1, 2): although ARIMA(4, 1, 1) has a slightly lower AIC, its ar2 and ar3 coefficients are not statistically significant (p = 0.63 and 0.99), whereas every coefficient of ARIMA(2, 1, 2) is significant.

# Fit the chosen final model, ARIMA(2,1,2) without a drift term,
# and report its coefficients and training-set accuracy.
fit <- Arima(ts_egg, order = c(2, 1, 2), include.drift = FALSE)
summary(fit)
Series: ts_egg 
ARIMA(2,1,2) 

Coefficients:
         ar1      ar2      ma1     ma2
      0.7836  -0.8356  -0.7742  0.9322
s.e.  0.0617   0.1029   0.0270  0.0915

sigma^2 = 0.01608:  log likelihood = 350.07
AIC=-690.13   AICc=-690.02   BIC=-668.68

Training set error measures:
                      ME      RMSE       MAE        MPE     MAPE      MASE
Training set 0.005541111 0.1262059 0.0776439 0.04752214 5.256716 0.3163514
                   ACF1
Training set 0.06016651
# Forecast the next 365 periods
# NOTE(review): ts_egg is monthly (frequency = 12), so h = 365 means 365
# months (~30 years), not one year -- confirm this horizon is intended.
forecast_result <- forecast(fit, h = 365)

# Display forecast accuracy
# (accuracy() with no test data reports training-set errors only.)
accuracy(forecast_result)
                      ME      RMSE       MAE        MPE     MAPE      MASE
Training set 0.005541111 0.1262059 0.0776439 0.04752214 5.256716 0.3163514
                   ACF1
Training set 0.06016651
# Plot the point forecasts with their prediction intervals.
autoplot(forecast_result) +
  theme_minimal() +
  labs(
    x = "Time",
    y = "Predicted Values",
    title = "ARIMA(2,1,2) Forecast"
  )

# Fit models individually and check residuals
# Benchmark forecasts over h = 200 periods: mean, naive, and drift methods.
f_mean <- meanf(ts_egg, h = 200)
head(f_mean$mean) #gives the forecasted values
         Jan     Feb     Mar     Apr     May     Jun
2025 1.37203 1.37203 1.37203 1.37203 1.37203 1.37203
# The mean forecast is flat at the overall series mean, as confirmed below.
mean(ts_egg)
[1] 1.37203
f_naive <- naive(ts_egg, h = 200)
#checkresiduals(f_naive)

f_drift <- rwf(ts_egg, drift = TRUE, h = 200)
# Residual diagnostics for the drift method (Ljung-Box output below).
checkresiduals(f_drift)


    Ljung-Box test

data:  Residuals from Random walk with drift
Q* = 73.623, df = 24, p-value = 6.101e-07

Model df: 0.   Total lags used: 24
# Generate 365-period-ahead forecasts from each benchmark method and the
# fitted ARIMA model.
mean_forecast <- meanf(ts_egg, h = 365)
naive_forecast <- naive(ts_egg, h = 365)
drift_forecast <- rwf(ts_egg, drift = TRUE, h = 365)
arima_forecast <- forecast(fit, h = 365)

# Convert each forecast's point predictions (and the original series) to
# plain data frames so plot_ly can draw them on a shared decimal-date axis.
mean_df <- data.frame(Date = time(mean_forecast$mean), Mean = as.numeric(mean_forecast$mean))
naive_df <- data.frame(Date = time(naive_forecast$mean), Naive = as.numeric(naive_forecast$mean))
drift_df <- data.frame(Date = time(drift_forecast$mean), Drift = as.numeric(drift_forecast$mean))
arima_df <- data.frame(Date = time(arima_forecast$mean), ARIMA_Fit = as.numeric(arima_forecast$mean))

ts_egg_df <- data.frame(Date = time(ts_egg), Price = as.numeric(ts_egg))

# Overlay the original series with all four forecasts.
# Fix: the title and y-axis label previously said "Job Postings" -- a
# copy-paste leftover from a different dataset; this analysis is egg prices.
plot_ly() %>%
  add_lines(data = ts_egg_df, x = ~Date, y = ~Price, name = 'Original Data', line = list(color = 'black')) %>%
  add_lines(data = mean_df, x = ~Date, y = ~Mean, name = 'Mean Forecast', line = list(color = 'blue')) %>%
  add_lines(data = naive_df, x = ~Date, y = ~Naive, name = 'Naïve Forecast', line = list(color = 'red')) %>%
  add_lines(data = drift_df, x = ~Date, y = ~Drift, name = 'Drift Forecast', line = list(color = 'green')) %>%
  add_lines(data = arima_df, x = ~Date, y = ~ARIMA_Fit, name = 'ARIMA Fit', line = list(color = 'purple')) %>%
  layout(title = 'Egg Price Forecast',
         xaxis = list(title = 'Date'),
         yaxis = list(title = 'Egg Price (USD)'),
         legend = list(title = list(text = 'Forecast Methods')))